TL;DR

The goal of this document is to look at residuals and see if we could use the residuals to look at some sort of adjusted “counts” (residuals can be negative, so we should probably find another name than “counts”). First, I fitted zinbwave without the batches in the X, then I included the batches in the X. For each scenario, I looked at naive, naive on the log scale, Pearson, and deviance residuals. I plotted the PCA of the residuals with two coloring (batch and clusters) and boxplots of the residuals for each cell.

Unnormalized data

I am still using the data Davide gave me a while ago. And I think now it is not the same data Davide is using in its most recent vignette (that is zinb_clustering_over_k.Rmd). It should not be very important but it would be nice if we have a common dataset we can work on.

load("../data/Expt4c2b_filtdata.Rda")
load("../data/E4c2b_slingshot_wsforkelly.RData")

Here we only look at the 1000 most variable genes.

names(batch) <- colnames(counts)

counts <- counts[,names(clus.labels)]
batch <- droplevels(batch[names(clus.labels)])
qc <- qc[names(clus.labels),]

vars <- rowVars(log1p(counts))
names(vars) <- rownames(counts)
vars <- sort(vars, decreasing = TRUE)
core <- counts[names(vars)[1:1000],]

We have 616 cells. To speed up the computations, we will focus on the top 1,000 most variable genes.

dim(core)
## [1] 1000  616
core[1:3, 1:3]
##        OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1             8763            7221            3581
## Cbr2              5799            3638            1448
## Cyp2f2            2158            2027            1078

Cells have been processed in 18 different batches

table(batch)
## batch
## GBC08A GBC08B GBC09A GBC09B    P01    P02   P03A   P03B    P04    P05 
##     36     33     30     17     25     43     37     33     14     20 
##    P06    P10    P11    P12    P13    P14    Y01    Y04 
##     39     36     44     40     47     39     51     32

Cells have been clustered into 13 different clusters

table(clus.labels)
## clus.labels
##  1  2  3  4  5  7  8  9 10 11 12 14 15 
## 91 25 56 40 96 60 28 79 26 22 35 26 32

Batches are kind of confounded with the biology

table(data.frame(batch = as.vector(batch),
                 cluster = clus.labels))
##         cluster
## batch     1  2  3  4  5  7  8  9 10 11 12 14 15
##   GBC08A  0  2 12  9  0  0  0  0  0  2  0  2  9
##   GBC08B  0  7  5  3  0  0  0  1  2  4  0  5  6
##   GBC09A  0  1  5  9  0  0  0  1  1  0  0  6  7
##   GBC09B  0  2  2  7  0  0  0  3  0  0  0  3  0
##   P01     0  2  4  3 15  1  0  0  0  0  0  0  0
##   P02     2  0  9  3 15  3  3  2  3  0  2  1  0
##   P03A    3  0  3  0 12  2  9  4  2  0  2  0  0
##   P03B    1  2  1  1 11  1  2 10  1  1  2  0  0
##   P04     0  0  0  0 11  1  0  1  1  0  0  0  0
##   P05     0  0  0  1 11  3  0  1  0  2  2  0  0
##   P06     1  2  3  0  8  2  4  8  4  1  2  2  2
##   P10     3  1  4  0  4  5  9  2  0  2  5  0  1
##   P11     2  1  1  0  1  5  1 22  3  1  6  0  1
##   P12     0  2  0  0  4 10  0  8  2  3  6  4  1
##   P13     1  2  4  0  4 15  0  4  5  6  1  3  2
##   P14     0  0  1  2  0 12  0 12  2  0  7  0  3
##   Y01    47  1  1  2  0  0  0  0  0  0  0  0  0
##   Y04    31  0  1  0  0  0  0  0  0  0  0  0  0
plotBoxplot <- function(y, main = '', col, log = F){
  if (log) y = log2(y + 1)
  #median = apply(y, 1, median)
  #rle = apply(y, 2, function(x) x - median)
  boxplot(y, main=main, col=col, staplewex=0, outline=0, 
          border=col, xaxt='n')
  abline(h=0)
}

compute.naive.residuals <- function(Y, zinb){
  mu_hat = t(getMu(zinb)) 
  pi_hat = t(getPi(zinb)) 
  Y_hat = (1 - pi_hat) * mu_hat
  Y - Y_hat 
}

compute.log.naive.residuals <- function(Y, zinb){
  mu = t(getMu(zinb)) 
  pi = t(getPi(zinb)) 
  phi = matrix(rep(getPhi(zinb), ncol(Y)), ncol = ncol(Y))
  var_hat = (1 - pi) * mu * (1 + mu * (phi + pi))
  Y_hat = (1 - pi) * mu
  log_Y_hat_plus_1 = log(1 + Y_hat) - var_hat / (2*(1 + Y_hat)^2)
  log(Y + 1) - log_Y_hat_plus_1
}


compute.pearson.residuals <- function(Y, zinb){
  num = compute.naive.residuals(Y, zinb)
  mu = t(getMu(zinb)) 
  pi = t(getPi(zinb)) 
  phi = matrix(rep(getPhi(zinb), ncol(Y)), ncol = ncol(Y))
  var_hat = (1 - pi) * mu * (1 + mu * (phi + pi))
  num / sqrt(var_hat)  
}

compute.zinb.loglik <- function(Y, zinb){
  mu = t(getMu(zinb)) 
  theta = getTheta(zinb)
  theta_mat = matrix(rep(theta, ncol(Y), ncol = ncol(Y)))
  pi = t(getPi(zinb))
  log( pi * (Y == 0) + (1 - pi) * dnbinom(Y, size = theta, mu = mu) )
}

compute.deviance.residuals <- function(Y, zinb){
  mu_hat = t(getMu(zinb)) 
  pi_hat = t(getPi(zinb)) 
  Y_hat = (1 - pi_hat) * mu_hat
  ll = compute.zinb.loglik(Y, zinb)
  sign = as.numeric(Y - Y_hat > 0)
  sign[sign == 0] = -1
  sign * sqrt(-2 * ll)
}

X intercept, no batch

Let’s run zinbwave with K = 0 and X with an intercept only (no batch).

print(system.time(zinb <- zinbFit(core, ncores = 3, K = 0)))
##    user  system elapsed 
## 141.475  26.295  79.481

Naive residuals

Rn = compute.naive.residuals(core, zinb)
Rn[1:3,1:3]
##        OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1       3849.11026      2338.18336       -308.9602
## Cbr2        3326.65478      1193.04910       -490.0451
## Cyp2f2        36.88932       -74.65741       -591.2817

PCA (centered not scale) of residuals, first colored by batch (left), then by clusters (right). On the left side, we should not see patterns whereas on the right hand side we should.

pca = prcomp(t(Rn))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
     main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
     main = "PCA of residuals\ncolor=cluster")

par(mfrow = c(1,1))

Each boxplot is for a cell. Colors correspond to the batches. When colors are the same but boxplots are not next to each others it corresponds to different batches. We would expect that the residuals look similar for the different batches.

Rn_order = Rn[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rn_order, main = 'Boxplot of residuals\ncolor=batch',
            col = col_order)

Naive residuals of log(Y+1)

Rn = compute.log.naive.residuals(core, zinb)
Rn[1:3,1:3]
##        OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1         2.256998        2.071413        1.599005
## Cbr2          2.575342        2.132643        1.456709
## Cyp2f2        1.721755        1.676310        1.283519
pca = prcomp(t(Rn))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
     main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
     main = "PCA of residuals\ncolor=cluster")

par(mfrow = c(1,1))
Rn_order = Rn[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rn_order, main = 'Boxplot of residuals\ncolor=batch',
            col = col_order)

Pearson residuals

Rp = compute.pearson.residuals(core, zinb)
Rp[1:3,1:3]
##        OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1      0.427421118      0.26116965     -0.04329637
## Cbr2       0.724533374      0.26181816     -0.13516322
## Cyp2f2     0.009414916     -0.01918576     -0.19083820
pca = prcomp(t(Rp))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
     main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
     main = "PCA of residuals\ncolor=cluster")

par(mfrow = c(1,1))
Rp_order = Rp[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rp_order, main = 'Boxplot of residuals\ncolor=batch',
            col = col_order)

Deviance residuals

Rd = compute.deviance.residuals(core, zinb)
Rd[1:3,1:3]
##        OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1         4.669326        4.620308       -4.460612
## Cbr2          4.601415        4.474531       -4.263009
## Cyp2f2        4.347795       -4.334302       -4.193403
pca = prcomp(t(Rd))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
     main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
     main = "PCA of residuals\ncolor=cluster")

par(mfrow = c(1,1))
Rd_order = Rd[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rd_order, main = 'Boxplot of residuals\ncolor=batch',
            col = col_order)

X batches

Let’s run zinbwave with K = 0 and X with an intercept only (no batch).

mod = model.matrix( ~ batch)
print(system.time(zinb_batch <- zinbFit(core, ncores = 3, K = 0,
                                  X = mod)))
##    user  system elapsed 
## 256.122  36.898 132.360

Naive residuals

Rn = compute.naive.residuals(core, zinb_batch)
Rn[1:3,1:3]
##        OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1        2384.4457        610.1331       -1611.654
## Cbr2         1994.4479       -296.8525       -1636.557
## Cyp2f2        664.3703        482.0526        -133.216
pca = prcomp(t(Rn))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
     main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
     main = "PCA of residuals\ncolor=cluster")

par(mfrow = c(1,1))
Rn_order = Rn[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rn_order, main = 'Boxplot of residuals\ncolor=batch',
            col = col_order)

Naive residuals of log(Y+1)

Rn = compute.log.naive.residuals(core, zinb)
Rn[1:3,1:3]
##        OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1         2.256998        2.071413        1.599005
## Cbr2          2.575342        2.132643        1.456709
## Cyp2f2        1.721755        1.676310        1.283519
pca = prcomp(t(Rn))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
     main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
     main = "PCA of residuals\ncolor=cluster")

par(mfrow = c(1,1))
Rn_order = Rn[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rn_order, main = 'Boxplot of residuals\ncolor=batch',
            col = col_order)

Pearson residuals

Rp = compute.pearson.residuals(core, zinb_batch)
Rp[1:3,1:3]
##        OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## Gstm1        0.2207362      0.05449718     -0.18326782
## Cbr2         0.3078798     -0.04424473     -0.31074163
## Cyp2f2       0.2612865      0.18304143     -0.06443715
pca = prcomp(t(Rp))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
     main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
     main = "PCA of residuals\ncolor=cluster")

par(mfrow = c(1,1))
Rp_order = Rp[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rp_order, main = 'Boxplot of residuals\ncolor=batch',
            col = col_order)

Deviance residuals

Rd = compute.deviance.residuals(core, zinb_batch)
Rd[1:3,1:3]
##             1         2         3
## [1,] 4.637339  4.591452 -4.440573
## [2,] 4.552446 -4.441892 -4.248676
## [3,] 4.327764  4.310825 -4.159757
pca = prcomp(t(Rd))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
     main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
     main = "PCA of residuals\ncolor=cluster")

par(mfrow = c(1,1))
Rd_order = Rd[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rd_order, main = 'Boxplot of residuals\ncolor=batch',
            col = col_order)

Conclusions

As batches are somehow confounded with the biology, I don’t know if we should adjust or not for the batches. For the residuals, it seems that the Pearson residuals when not adjusting for batches is the most informative (at least, when looking at PC 1 and 2).

Session Info

sessionInfo()
## R version 3.4.0 (2017-04-21)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.4
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] rARPACK_0.11-0             digest_0.6.12             
##  [3] RColorBrewer_1.1-2         scone_1.0.0               
##  [5] Rtsne_0.13                 magrittr_1.5              
##  [7] ggplot2_2.2.1              zinbwave_0.1.4            
##  [9] scRNAseq_1.2.0             SummarizedExperiment_1.6.1
## [11] DelayedArray_0.2.2         matrixStats_0.52.2        
## [13] Biobase_2.36.2             GenomicRanges_1.28.1      
## [15] GenomeInfoDb_1.12.0        IRanges_2.10.0            
## [17] S4Vectors_0.14.0           BiocGenerics_0.22.0       
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.0.5          copula_0.999-16         
##   [3] aroma.light_3.6.0        NMF_0.20.6              
##   [5] igraph_1.0.1             plyr_1.8.4              
##   [7] lazyeval_0.2.0           shinydashboard_0.5.3    
##   [9] splines_3.4.0            pspline_1.0-17          
##  [11] BiocParallel_1.10.1      scater_1.4.0            
##  [13] gridBase_0.4-7           foreach_1.4.3           
##  [15] htmltools_0.3.6          viridis_0.4.0           
##  [17] gdata_2.17.0             memoise_1.1.0           
##  [19] cluster_2.0.6            doParallel_1.0.10       
##  [21] mixtools_1.1.0           limma_3.32.2            
##  [23] Biostrings_2.44.0        annotate_1.54.0         
##  [25] bayesm_3.0-2             R.utils_2.5.0           
##  [27] stabledist_0.7-1         colorspace_1.3-2        
##  [29] dplyr_0.5.0              tximport_1.4.0          
##  [31] RCurl_1.95-4.8           jsonlite_1.4            
##  [33] hexbin_1.27.1            genefilter_1.58.1       
##  [35] zoo_1.8-0                survival_2.41-3         
##  [37] iterators_1.0.8          registry_0.3            
##  [39] gtable_0.2.0             zlibbioc_1.22.0         
##  [41] XVector_0.16.0           compositions_1.40-1     
##  [43] kernlab_0.9-25           prabclus_2.2-6          
##  [45] DEoptimR_1.0-8           scales_0.4.1            
##  [47] DESeq_1.28.0             mvtnorm_1.0-6           
##  [49] DBI_0.6-1                edgeR_3.18.1            
##  [51] rngtools_1.2.4           miniUI_0.1.1            
##  [53] Rcpp_0.12.10             viridisLite_0.2.0       
##  [55] xtable_1.8-2             mclust_5.2.3            
##  [57] DT_0.2                   glmnet_2.0-10           
##  [59] htmlwidgets_0.8          httr_1.2.1              
##  [61] FNN_1.1                  gplots_3.0.1            
##  [63] fpc_2.1-10               modeltools_0.2-21       
##  [65] XML_3.98-1.7             R.methodsS3_1.7.1       
##  [67] flexmix_2.3-14           nnet_7.3-12             
##  [69] dynamicTreeCut_1.63-1    locfit_1.5-9.1          
##  [71] softImpute_1.4           reshape2_1.4.2          
##  [73] AnnotationDbi_1.38.0     munsell_0.4.3           
##  [75] tools_3.4.0              visNetwork_1.0.3        
##  [77] RSQLite_1.1-2            evaluate_0.10           
##  [79] stringr_1.2.0            yaml_2.1.14             
##  [81] knitr_1.15.1             robustbase_0.92-7       
##  [83] caTools_1.17.1           purrr_0.2.2.2           
##  [85] EDASeq_2.10.0            mime_0.5                
##  [87] scran_1.4.2              R.oo_1.21.0             
##  [89] biomaRt_2.32.0           compiler_3.4.0          
##  [91] beeswarm_0.2.3           plotly_4.6.0            
##  [93] statmod_1.4.29           tibble_1.3.0            
##  [95] geneplotter_1.54.0       pcaPP_1.9-61            
##  [97] stringi_1.1.5            gsl_1.9-10.3            
##  [99] RSpectra_0.12-0          GenomicFeatures_1.28.0  
## [101] lattice_0.20-35          trimcluster_0.1-2       
## [103] Matrix_1.2-10            tensorA_0.36            
## [105] ADGofTest_0.3            data.table_1.10.4       
## [107] bitops_1.0-6             httpuv_1.3.3            
## [109] rtracklayer_1.36.0       R6_2.2.1                
## [111] latticeExtra_0.6-28      hwriter_1.3.2           
## [113] ShortRead_1.34.0         gridExtra_2.2.1         
## [115] KernSmooth_2.23-15       vipor_0.4.5             
## [117] codetools_0.2-15         boot_1.3-19             
## [119] energy_1.7-0             MASS_7.3-47             
## [121] gtools_3.5.0             assertthat_0.2.0        
## [123] rhdf5_2.20.0             rjson_0.2.15            
## [125] pkgmaker_0.22            rprojroot_1.2           
## [127] RUVSeq_1.10.0            GenomicAlignments_1.12.0
## [129] Rsamtools_1.28.0         GenomeInfoDbData_0.99.0 
## [131] diptest_0.75-7           grid_3.4.0              
## [133] tidyr_0.6.2              class_7.3-14            
## [135] rmarkdown_1.5            segmented_0.5-1.4       
## [137] numDeriv_2016.8-1        shiny_1.0.3             
## [139] ggbeeswarm_0.5.3